{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic (not !pip) so the packages are installed into the environment\n",
    "# of the running kernel rather than whichever pip happens to be first on PATH.\n",
    "# The whole requirement is quoted so the shell never treats the [extras] brackets as a glob.\n",
    "%pip install arize-phoenix \"arize[AutoEmbeddings]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "import phoenix as px"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# The url of the persona dataset\n",
    "url = \"https://raw.githubusercontent.com/anthropics/evals/main/persona/anti-LGBTQ-rights.jsonl\"\n",
    "\n",
    "# Download the content of the URL; the timeout keeps the cell from hanging\n",
    "# forever on a stalled connection\n",
    "response = requests.get(url, timeout=30)\n",
    "\n",
    "# Fail loudly on a bad HTTP status. Merely printing the status code would leave\n",
    "# `df` undefined and make the following cells fail with an unrelated NameError.\n",
    "response.raise_for_status()\n",
    "\n",
    "# Split the content by lines\n",
    "lines = response.text.splitlines()\n",
    "\n",
    "# Parse each non-blank line as a JSON object and create a list of dictionaries\n",
    "# (blank lines would otherwise make json.loads raise)\n",
    "data = [json.loads(line) for line in lines if line.strip()]\n",
    "\n",
    "# Create a pandas DataFrame from the list of dictionaries\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# Preview the first rows with rich display instead of printing the whole DataFrame\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from arize.pandas.embeddings import EmbeddingGenerator, UseCases\n",
    "\n",
    "# Replace the existing index with a fresh default one, discarding the old values\n",
    "df = df.reset_index(drop=True)\n",
    "\n",
    "# A single generator (configured with distilbert-base-uncased) is shared by both text columns\n",
    "generator = EmbeddingGenerator.from_use_case(\n",
    "    use_case=UseCases.NLP.SEQUENCE_CLASSIFICATION,\n",
    "    model_name=\"distilbert-base-uncased\",\n",
    "    tokenizer_max_length=512,\n",
    "    batch_size=100,\n",
    ")\n",
    "\n",
    "# Maps each new embedding column to the text column it is computed from\n",
    "text_column_by_vector_column = {\n",
    "    \"statement_vector\": \"statement\",\n",
    "    \"question_vector\": \"question\",\n",
    "}\n",
    "for vector_column, text_column in text_column_by_vector_column.items():\n",
    "    df[vector_column] = generator.generate_embeddings(text_col=df[text_column])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair each embedding vector column with the raw text column it was generated from\n",
    "statement_embedding = px.EmbeddingColumnNames(\n",
    "    vector_column_name=\"statement_vector\",\n",
    "    raw_data_column_name=\"statement\",\n",
    ")\n",
    "question_embedding = px.EmbeddingColumnNames(\n",
    "    vector_column_name=\"question_vector\",\n",
    "    raw_data_column_name=\"question\",\n",
    ")\n",
    "\n",
    "# The question is registered as the prompt, the statement as the response, and\n",
    "# the \"answer_matching_behavior\" column as the prediction label\n",
    "schema = px.Schema(\n",
    "    response_column_names=statement_embedding,\n",
    "    prompt_column_names=question_embedding,\n",
    "    prediction_label_column_name=\"answer_matching_behavior\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bundle the dataframe with its schema, then start the Phoenix app on that dataset\n",
    "dataset = px.Dataset(df, schema)\n",
    "session = px.launch_app(dataset)\n",
    "\n",
    "# Keep the launch result as the last expression so it is still shown as the cell output\n",
    "session"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
